Model, analyse and visualise network data using R.
In this take home exercise, I will reveal the patterns of community interactions of the city of Engagement, Ohio USA by using social network analysis approach.
Before we get started, it is important to ensure that the R packages have been installed. If yes, we will load the R packages. If they have yet to be installed, we will install the R packages and load them onto R environment.
The chunk code below will do the trick.
packages = c('igraph', 'tidygraph',
'ggraph', 'visNetwork',
'lubridate', 'clock',
'tidyverse', 'ggmap','zoo')
for(p in packages){
if(!require(p, character.only = T)){
install.packages(p)
}
library(p, character.only = T)
}
network_nodes <- read_csv("data/Participants.csv")
network_edges <- read_csv("data/SocialNetwork.csv")
glimpse(network_edges)
Rows: 1,048,575
Columns: 3
$ timestamp <dttm> 2022-03-01, 2022-03-01, 2022-03-01, 2022-03-01, 2…
$ source <dbl> 173, 178, 178, 180, 183, 183, 185, 185, 186, 186, …
$ target <dbl> 180, 183, 185, 173, 178, 185, 178, 183, 187, 204, …
network_nodes <- network_nodes %>%
mutate(participantId = participantId +1)
network_edges <- network_edges %>%
mutate(source = source +1) %>%
mutate(target = target +1)
network_edges <- network_edges %>%
mutate(Weekday = wday(timestamp,
label = TRUE,
abbr = FALSE)) %>%
mutate(YearMonth = format(timestamp,'%Y-%m'))
network_edges_aggregated <- network_edges %>%
filter(YearMonth == "2022-03") %>%
group_by(source, target) %>%
summarise(Weight = n()) %>%
filter(source!=target) %>%
ungroup()
hist(network_edges_aggregated$Weight)
summary(network_edges_aggregated$Weight)
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.00 6.00 13.00 14.24 20.00 31.00
From the distribution shown above, we can choose the interactions of more than 20 as close relationship of participants.
network_edges_aggregated <- network_edges %>%
filter(YearMonth == "2022-03") %>%
group_by(source, target) %>%
summarise(Weight = n()) %>%
filter(source!=target) %>%
filter(Weight > 20) %>%
ungroup() %>%
mutate_at(vars(source, target), as.character)
nodes <-c(network_edges_aggregated$source,network_edges_aggregated$target)
nodes <- data.frame(participantId=unlist(nodes, use.names = FALSE))
nodes <- distinct(nodes, participantId)
network_nodes <- merge(x=network_nodes,y=nodes,by="participantId") %>%
mutate(participantId = as.character(participantId))
#nodes <- subset(network_edges_aggregated,select=c(source))
network_graph <- tbl_graph(nodes = network_nodes,
edges = network_edges_aggregated,
directed = TRUE)
network_graph
# A tbl_graph: 816 nodes and 3002 edges
#
# A directed simple graph with 20 components
#
# Node Data: 816 × 7 (active)
participantId householdSize haveKids age educationLevel
<chr> <dbl> <lgl> <dbl> <chr>
1 2 3 TRUE 25 HighSchoolOrC…
2 3 3 TRUE 35 HighSchoolOrC…
3 4 3 TRUE 21 HighSchoolOrC…
4 5 3 TRUE 43 Bachelors
5 6 3 TRUE 32 HighSchoolOrC…
6 7 3 TRUE 26 HighSchoolOrC…
# … with 810 more rows, and 2 more variables: interestGroup <chr>,
# joviality <dbl>
#
# Edge Data: 3,002 × 3
from to Weight
<int> <int> <int>
1 1 51 25
2 1 695 29
3 2 177 23
# … with 2,999 more rows
network_graph %>%
activate(edges) %>%
arrange(desc(Weight))
# A tbl_graph: 816 nodes and 3002 edges
#
# A directed simple graph with 20 components
#
# Edge Data: 3,002 × 3 (active)
from to Weight
<int> <int> <int>
1 155 161 31
2 160 164 31
3 160 166 31
4 161 155 31
5 164 160 31
6 164 166 31
# … with 2,996 more rows
#
# Node Data: 816 × 7
participantId householdSize haveKids age educationLevel
<chr> <dbl> <lgl> <dbl> <chr>
1 2 3 TRUE 25 HighSchoolOrC…
2 3 3 TRUE 35 HighSchoolOrC…
3 4 3 TRUE 21 HighSchoolOrC…
# … with 813 more rows, and 2 more variables: interestGroup <chr>,
# joviality <dbl>
g <- ggraph(network_graph) +
geom_edge_link(aes()) +
geom_node_point(aes())+
labs(title = "Network of Engagemnet")
g + theme_graph()
g <- ggraph(network_graph,
layout = "nicely") +
geom_edge_link(aes()) +
geom_node_point(aes(colour = educationLevel,
size = 1))+
labs(title = "Network of Engagemnet with different Education Level")
g + theme_graph()
g <- ggraph(network_graph,
layout = "nicely") +
geom_edge_link(aes(width=Weight),
alpha=0.2) +
scale_edge_width(range = c(0.1, 5)) +
geom_node_point(aes(colour = educationLevel),
size = 1)+
labs(title = "Network of Engagemnet with different Education Level")
g + theme_graph()
set_graph_style()
g <- ggraph(network_graph,
layout = "nicely") +
geom_edge_link(aes(width=Weight),
alpha=0.2) +
scale_edge_width(range = c(0.1, 5)) +
geom_node_point(aes(colour = educationLevel),
size = 2)+
labs(title = "Network of Engagemnet")
g + facet_nodes(~educationLevel)+
th_foreground(foreground = "grey80",
border = TRUE) +
theme(legend.position = 'bottom')
g <- network_graph %>%
mutate(betweenness_centrality = centrality_betweenness()) %>%
ggraph(layout = "fr") +
geom_edge_link(aes(width=Weight),
alpha=0.2) +
scale_edge_width(range = c(0.1, 5)) +
geom_node_point(aes(colour = educationLevel,
size=betweenness_centrality))+
labs(title = "Network of Engagemnet with betweenness centrality")
g + theme_graph()
g <- network_graph %>%
ggraph(layout = "fr") +
geom_edge_link(aes(width=Weight),
alpha=0.2) +
scale_edge_width(range = c(0.1, 5)) +
geom_node_point(aes(colour = educationLevel,
size = centrality_betweenness()))+
labs(title = "Network of Engagemnet with betweenness centrality")
g + theme_graph()
g <- network_graph %>%
mutate(community = as.factor(group_edge_betweenness(weights = Weight, directed = TRUE))) %>%
ggraph(layout = "fr") +
geom_edge_link(aes(width=Weight),
alpha=0.2) +
scale_edge_width(range = c(0.1, 5)) +
geom_node_point(aes(colour = community))
g + theme_graph()+
labs(title = "Network of Engagemnet with community")
From the plots shown above, we can conclude that participants with low education level tend to have less interactions with other participants. People with bachelor degree or High school degree are more likely to have higher betweenness centrality.
Processing the data
network_edges_aggregated <- network_edges %>%
mutate(source = as.character(source)) %>%
mutate(target = as.character(target)) %>%
left_join(network_nodes, by = c("source" = "participantId")) %>%
left_join(network_nodes, by = c("target" = "participantId")) %>%
filter(YearMonth == "2022-03") %>%
group_by(source, target) %>%
summarise(weight = n()) %>%
filter(source!=target) %>%
filter(weight > 10) %>%
ungroup()
To show the network graph interactively, we can use visNetwork. In this case the number of nodes is still too big to clearly show the network interactively. We can select some of the nodes and show their network in further steps.
visNetwork(network_nodes,
network_edges_aggregated)